Project Overview

This project involves the analysis and clustering of a wine dataset obtained from Kaggle, containing 13 variables including alcohol, malic acid, and other chemical properties of wines. The dataset does not include any outcome variable, making it ideal for unsupervised learning techniques. The primary goal of this project is to explore the underlying patterns within the data by applying clustering algorithms such as k-means and hierarchical clustering. Additionally, Principal Component Analysis (PCA) will be performed to reduce the dimensionality of the data and to compare the performance and insights derived from these clustering methods.

# Load the libraries
library(tidyverse)   # data wrangling and ggplot2
library(tidymodels)  # rsample / recipes / tune infrastructure
library(tidyclust)   # tidymodels-style clustering specs (k_means, hier_clust)
# Load the data: 178 wines x 13 numeric chemical measurements, no outcome column
wine <- read_csv("wine-clustering.csv")
glimpse(wine)
## Rows: 178
## Columns: 13
## $ Alcohol              <dbl> 14.23, 13.20, 13.16, 14.37, 13.24, 14.20, 14.39, …
## $ Malic_Acid           <dbl> 1.71, 1.78, 2.36, 1.95, 2.59, 1.76, 1.87, 2.15, 1…
## $ Ash                  <dbl> 2.43, 2.14, 2.67, 2.50, 2.87, 2.45, 2.45, 2.61, 2…
## $ Ash_Alcanity         <dbl> 15.6, 11.2, 18.6, 16.8, 21.0, 15.2, 14.6, 17.6, 1…
## $ Magnesium            <dbl> 127, 100, 101, 113, 118, 112, 96, 121, 97, 98, 10…
## $ Total_Phenols        <dbl> 2.80, 2.65, 2.80, 3.85, 2.80, 3.27, 2.50, 2.60, 2…
## $ Flavanoids           <dbl> 3.06, 2.76, 3.24, 3.49, 2.69, 3.39, 2.52, 2.51, 2…
## $ Nonflavanoid_Phenols <dbl> 0.28, 0.26, 0.30, 0.24, 0.39, 0.34, 0.30, 0.31, 0…
## $ Proanthocyanins      <dbl> 2.29, 1.28, 2.81, 2.18, 1.82, 1.97, 1.98, 1.25, 1…
## $ Color_Intensity      <dbl> 5.64, 4.38, 5.68, 7.80, 4.32, 6.75, 5.25, 5.05, 5…
## $ Hue                  <dbl> 1.04, 1.05, 1.03, 0.86, 1.04, 1.05, 1.02, 1.06, 1…
## $ OD280                <dbl> 3.92, 3.40, 3.17, 3.45, 2.93, 2.85, 3.58, 3.58, 2…
## $ Proline              <dbl> 1065, 1050, 1185, 1480, 735, 1450, 1290, 1295, 10…

EDA

library(skimr)
# skim() prints per-column summary statistics, including missing-value counts
skim(wine)
# There are no missing values in the data set

Distribution of different variables

# One histogram per variable, reshaped to long format so a single faceted
# ggplot can display all 13 distributions at once
wine %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "Value") %>%
  ggplot(aes(x = Value, fill = Variable)) +
  facet_wrap(~Variable, scales = "free_x") +
  geom_histogram()

# Majority of the variables are right skewed

Correlation analysis

library(ggcorrplot)

# Pairwise Pearson correlations; "complete.obs" drops rows with NAs (none here)
cor_matrix <- wine %>% cor(use = "complete.obs")

# Full square correlation heatmap with the coefficients printed in each cell
ggcorrplot(
  cor_matrix,
  method = "square",
  type = "full",
  lab = TRUE,
  lab_size = 3,
  tl.cex = 10
)

There are a few highly correlated variables — for example, Alcohol and Proline, and Flavanoids and OD280.

Data Preprocessing

Train and test sets

# Reproducible 75/25 train/test split. Clustering is unsupervised, but a
# hold-out set lets us sanity-check cluster predictions on unseen wines.
set.seed(123)
wine_splits <- initial_split(wine, prop = 0.75)
wine_train <- training(wine_splits)
wine_test <- testing(wine_splits)

Recipes

# Preprocessing: drop zero-variance columns, then center and scale every
# numeric predictor (k-means and hclust are distance based, so scaling matters)
basic_rec <- recipe(~., data = wine_train) %>% 
  step_zv(all_predictors()) %>% 
  step_normalize(all_numeric_predictors())

# Estimate the recipe on the training data (printed for inspection)
prep(basic_rec)

# Add a principal component analysis step because of the highly correlated variables
pca_rec <- basic_rec %>% 
  step_pca(all_numeric_predictors(), num_comp = 4)

Kmeans Clustering

Model Specification

  • Using a k-means clustering algorithm — it classifies the observations into a set of k groups such that observations within the same cluster are as similar as possible, whereas observations from different clusters are as dissimilar as possible
# k-means specification; the number of clusters is left as a tuning parameter
kmeans_spec <- k_means(num_clusters = tune()) %>% 
  set_engine("stats")

Workflow

# Bundle the preprocessing recipe and the k-means spec into a single workflow
kmeans_wf <- workflow() %>%
  add_model(kmeans_spec) %>%
  add_recipe(basic_rec)

Tuning

# First create cross validation folds (10-fold CV on the training set)
set.seed(4567)
wine_folds <- vfold_cv(wine_train, v = 10)

# Create a grid of candidate cluster counts: k = 1..10
grid <- tibble(num_clusters = 1:10)

# Evaluate every candidate k on every fold, scoring each fit with the
# average silhouette width (higher = better-separated clusters)
wine_res <- tune_cluster(
  kmeans_wf,
  resamples = wine_folds,
  grid = grid,
  metrics = cluster_metric_set(silhouette_avg)
)

wine_res
## # Tuning results
## # 10-fold cross-validation 
## # A tibble: 10 × 4
##    splits           id     .metrics          .notes          
##    <list>           <chr>  <list>            <list>          
##  1 <split [119/14]> Fold01 <tibble [10 × 5]> <tibble [0 × 3]>
##  2 <split [119/14]> Fold02 <tibble [10 × 5]> <tibble [0 × 3]>
##  3 <split [119/14]> Fold03 <tibble [10 × 5]> <tibble [0 × 3]>
##  4 <split [120/13]> Fold04 <tibble [10 × 5]> <tibble [0 × 3]>
##  5 <split [120/13]> Fold05 <tibble [10 × 5]> <tibble [0 × 3]>
##  6 <split [120/13]> Fold06 <tibble [10 × 5]> <tibble [0 × 3]>
##  7 <split [120/13]> Fold07 <tibble [10 × 5]> <tibble [0 × 3]>
##  8 <split [120/13]> Fold08 <tibble [10 × 5]> <tibble [0 × 3]>
##  9 <split [120/13]> Fold09 <tibble [10 × 5]> <tibble [0 × 3]>
## 10 <split [120/13]> Fold10 <tibble [10 × 5]> <tibble [0 × 3]>
# Summarise the silhouette metric across folds for each candidate k
wine_res %>% 
  collect_metrics()
## # A tibble: 10 × 7
##    num_clusters .metric        .estimator    mean     n  std_err .config        
##           <int> <chr>          <chr>        <dbl> <int>    <dbl> <chr>          
##  1            1 silhouette_avg standard   NaN         0 NA       Preprocessor1_…
##  2            2 silhouette_avg standard     0.266    10  0.00242 Preprocessor1_…
##  3            3 silhouette_avg standard     0.280    10  0.00210 Preprocessor1_…
##  4            4 silhouette_avg standard     0.258    10  0.00555 Preprocessor1_…
##  5            5 silhouette_avg standard     0.219    10  0.00851 Preprocessor1_…
##  6            6 silhouette_avg standard     0.198    10  0.00776 Preprocessor1_…
##  7            7 silhouette_avg standard     0.172    10  0.0101  Preprocessor1_…
##  8            8 silhouette_avg standard     0.169    10  0.00561 Preprocessor1_…
##  9            9 silhouette_avg standard     0.163    10  0.00755 Preprocessor1_…
## 10           10 silhouette_avg standard     0.155    10  0.00344 Preprocessor1_…
# Visualizing the results: mean silhouette width against the number of clusters
wine_res %>%
  collect_metrics() %>%
  select(num_clusters, mean) %>%
  ggplot(aes(x = num_clusters, y = mean)) +
  geom_point() +
  geom_line() +
  theme_minimal() +
  ylab("mean silhouette score") +   # fixed typo: "silhoutte" -> "silhouette"
  xlab("Number of clusters") +
  scale_x_continuous(breaks = 1:10)

Based on the tuning results, the highest silhouette average obtained is 0.28, which corresponds to 3 clusters, indicating that 3 is the optimal number of clusters for the model.

# Select the best cluster count (k = 3, highest mean silhouette) and
# finalize the workflow by substituting it for the tune() placeholder

best_cluster <- tibble(num_clusters = 3)

wines_final_wf <- finalize_workflow_tidyclust(kmeans_wf, best_cluster)
wines_final_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: k_means()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_zv()
## • step_normalize()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## K Means Cluster Specification (partition)
## 
## Main Arguments:
##   num_clusters = 3
## 
## Computational engine: stats
# Fit the finalized k-means workflow on the training data
wines_fit <- fit(wines_final_wf, data = wine_train)
wines_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: k_means()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_zv()
## • step_normalize()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## K-means clustering with 3 clusters of sizes 38, 50, 45
## 
## Cluster means:
##      Alcohol Malic_Acid        Ash Ash_Alcanity   Magnesium Total_Phenols
## 1  0.1161905  0.8995691  0.2483876   0.58560917 -0.06356151    -0.9147589
## 2  0.7584965 -0.3156888  0.3242022  -0.53313502  0.53103730     0.9118989
## 3 -0.9408903 -0.4088708 -0.5699742   0.09785784 -0.53636728    -0.2407579
##    Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity        Hue
## 1 -1.19945157           0.75676445      -0.6915342      0.97607424 -1.1570383
## 2  0.97796390          -0.65445335       0.6213731      0.07029791  0.5518514
## 3 -0.07375634           0.08812485      -0.1064523     -0.90234926  0.3638863
##        OD280    Proline
## 1 -1.2471254 -0.3467537
## 2  0.7745446  1.0042800
## 3  0.1925231 -0.8230524
## 
## Clustering vector:
##   [1] 1 2 1 2 3 2 1 1 1 3 3 1 3 1 2 2 2 2 1 3 3 1 3 3 3 1 2 3 1 1 2 2 2 1 1 2 3
##  [38] 1 1 3 3 1 2 2 2 1 3 2 3 3 2 3 1 3 3 2 2 3 1 1 1 3 1 2 2 1 2 2 2 3 1 3 3 2
##  [75] 2 2 2 2 1 3 1 3 2 3 3 1 3 3 2 2 2 1 3 3 1 2 2 3 3 2 3 2 1 1 1 3 1 2 2 2 1
## [112] 3 3 1 2 2 2 2 3 2 3 3 1 1 2 3 3 2 3 2 2 2 3
## 
## Within cluster sum of squares by cluster:
## [1] 237.4795 347.6556 359.7687
##  (between_SS / total_SS =  44.9 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
extract_cluster_assignment(wines_fit) %>% table()   # cluster assignments of the training observations
## .cluster
## Cluster_1 Cluster_2 Cluster_3 
##        38        50        45

Most observations are in the second cluster

extract_centroids(wines_fit)   # returns the location of the centroids
## # A tibble: 3 × 14
##   .cluster  Alcohol Malic_Acid    Ash Ash_Alcanity Magnesium Total_Phenols
##   <fct>       <dbl>      <dbl>  <dbl>        <dbl>     <dbl>         <dbl>
## 1 Cluster_1   0.116      0.900  0.248       0.586    -0.0636        -0.915
## 2 Cluster_2   0.758     -0.316  0.324      -0.533     0.531          0.912
## 3 Cluster_3  -0.941     -0.409 -0.570       0.0979   -0.536         -0.241
## # ℹ 7 more variables: Flavanoids <dbl>, Nonflavanoid_Phenols <dbl>,
## #   Proanthocyanins <dbl>, Color_Intensity <dbl>, Hue <dbl>, OD280 <dbl>,
## #   Proline <dbl>

Visualize the clusters

library(factoextra)

# fviz_cluster() needs the raw stats::kmeans object, not a workflow, so the
# fitted model is first pulled out of the trained workflow with
# extract_fit_parsnip().
kmeans_model <- extract_fit_parsnip(wines_fit)$fit

# Plot the clusters over the preprocessed (normalized) training data
fviz_cluster(
  kmeans_model,
  data = prep(basic_rec) %>% bake(new_data = NULL),
  geom = "point",
  ellipse.type = "convex",
  palette = "jco",
  ggtheme = theme_minimal()
)

An interactive visualization of clusters in the wine dataset using the variables Alcohol and Malic_Acid.

# Scatter of Alcohol vs Malic_Acid (log-log axes), points coloured by the
# k-means cluster assignment, then rendered interactively with plotly
p <- extract_cluster_assignment(wines_fit) %>%
  bind_cols(wine_train) %>%
  ggplot(aes(Alcohol, Malic_Acid)) +
  geom_point(aes(fill = .cluster), shape = 21, alpha = 0.5, size = 5) +
  geom_smooth(color = "blue", se = FALSE) +
  scale_x_log10() +
  scale_y_log10()

library(plotly)
ggplotly(p)

To make predictions using the fitted model, use predict()

# Making some predictions on a random sample of training rows
predict(wines_fit, new_data = slice_sample(wine_train, n = 10))   # returns the cluster a new observation belongs to
## # A tibble: 10 × 1
##    .pred_cluster
##    <fct>        
##  1 Cluster_1    
##  2 Cluster_2    
##  3 Cluster_1    
##  4 Cluster_1    
##  5 Cluster_2    
##  6 Cluster_2    
##  7 Cluster_3    
##  8 Cluster_3    
##  9 Cluster_1    
## 10 Cluster_2
# Predict cluster membership for the held-out test wines using the model
# trained on the training set (no refitting happens here)
wines_pred <- predict(wines_fit, new_data = wine_test)
wines_pred
## # A tibble: 45 × 1
##    .pred_cluster
##    <fct>        
##  1 Cluster_2    
##  2 Cluster_2    
##  3 Cluster_2    
##  4 Cluster_2    
##  5 Cluster_2    
##  6 Cluster_2    
##  7 Cluster_2    
##  8 Cluster_2    
##  9 Cluster_2    
## 10 Cluster_2    
## # ℹ 35 more rows
  • After fitting the k-means model, the silhouette average score achieved was 0.28, indicating moderate cluster separation.
  • A silhouette score of 0.28 suggests that the clustering is not very strong, and there may be some degree of overlap or ambiguity in the cluster assignments.
  • Can be improved by reviewing the number of clusters, employing PCA, or using alternative clustering algorithms
  • Step 1: Fit a Hierarchical model to explore whether it improves clustering performance

Hierarchical Clustering (Agglomerative Clustering)

  • It is an algorithm that creates a hierarchy of clusters, does not require the pre-specification of the number of clusters
  • Agglomerative clustering is a type of hierarchical clustering that works in a bottom up manner
  • It starts with each data point as its own cluster and gradually merges them based on some measure of similarity or distance, such as Euclidean distance

Model specification

# Agglomerative hierarchical clustering with Ward (D2) linkage;
# the number of clusters to cut the tree into is left as a tuning parameter
hc_spec <- hier_clust(num_clusters = tune(),
                      linkage_method = "ward.D2") %>% 
  set_engine("stats")

hc_spec
## Hierarchical Clustering Specification (partition)
## 
## Main Arguments:
##   num_clusters = tune()
##   linkage_method = ward.D2
## 
## Computational engine: stats

Workflow

# Reuse the same preprocessing recipe, paired with the hierarchical spec
hc_wf <- workflow() %>%
  add_model(hc_spec) %>%
  add_recipe(basic_rec)

hc_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: hier_clust()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_zv()
## • step_normalize()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Hierarchical Clustering Specification (partition)
## 
## Main Arguments:
##   num_clusters = tune()
##   linkage_method = ward.D2
## 
## Computational engine: stats

Model tuning

# Tune k for the hierarchical model on the same folds and grid used for
# k-means, so the two models' silhouette scores are directly comparable
set.seed(4566)
hc_res <- tune_cluster(
  hc_wf,
  resamples = wine_folds,
  grid = grid,
  metrics = cluster_metric_set(silhouette_avg)
)

hc_res
## # Tuning results
## # 10-fold cross-validation 
## # A tibble: 10 × 4
##    splits           id     .metrics          .notes          
##    <list>           <chr>  <list>            <list>          
##  1 <split [119/14]> Fold01 <tibble [10 × 5]> <tibble [0 × 3]>
##  2 <split [119/14]> Fold02 <tibble [10 × 5]> <tibble [0 × 3]>
##  3 <split [119/14]> Fold03 <tibble [10 × 5]> <tibble [0 × 3]>
##  4 <split [120/13]> Fold04 <tibble [10 × 5]> <tibble [0 × 3]>
##  5 <split [120/13]> Fold05 <tibble [10 × 5]> <tibble [0 × 3]>
##  6 <split [120/13]> Fold06 <tibble [10 × 5]> <tibble [0 × 3]>
##  7 <split [120/13]> Fold07 <tibble [10 × 5]> <tibble [0 × 3]>
##  8 <split [120/13]> Fold08 <tibble [10 × 5]> <tibble [0 × 3]>
##  9 <split [120/13]> Fold09 <tibble [10 × 5]> <tibble [0 × 3]>
## 10 <split [120/13]> Fold10 <tibble [10 × 5]> <tibble [0 × 3]>
# Mean silhouette per candidate k for the hierarchical model
hc_res %>% 
  collect_metrics()
## # A tibble: 10 × 7
##    num_clusters .metric        .estimator      mean     n  std_err .config      
##           <int> <chr>          <chr>          <dbl> <int>    <dbl> <chr>        
##  1            1 silhouette_avg standard   NaN           0 NA       Preprocessor…
##  2            2 silhouette_avg standard     0.241      10  0.00704 Preprocessor…
##  3            3 silhouette_avg standard     0.275      10  0.00274 Preprocessor…
##  4            4 silhouette_avg standard    -0.00901    10  0.0444  Preprocessor…
##  5            5 silhouette_avg standard    -0.0797     10  0.0302  Preprocessor…
##  6            6 silhouette_avg standard    -0.107      10  0.0229  Preprocessor…
##  7            7 silhouette_avg standard    -0.0626     10  0.0171  Preprocessor…
##  8            8 silhouette_avg standard    -0.00273    10  0.0136  Preprocessor…
##  9            9 silhouette_avg standard    -0.0270     10  0.0192  Preprocessor…
## 10           10 silhouette_avg standard    -0.0504     10  0.0226  Preprocessor…
# Visualizing the result: mean silhouette width against the number of clusters
hc_res %>%
  collect_metrics() %>%
  ggplot(aes(num_clusters, mean)) +
  geom_point() +
  geom_line() +
  theme_minimal() +
  ylab("mean silhouette score") +   # fixed typo: "silhoutte" -> "silhouette"
  xlab("Number of clusters") +
  scale_x_continuous(breaks = 1:10)

After fitting the hierarchical model, the silhouette average score obtained was 0.27, corresponding to 3 clusters. This is consistent with the k-means model, which also identified 3 clusters as optimal.

# Finalize the workflow at k = 3 (best silhouette) and fit on training data
best_cluster <- tibble(num_clusters = 3)

hc_final <- finalize_workflow_tidyclust(hc_wf, best_cluster)

hc_fit <- fit(hc_final, data = wine_train)
hc_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: hier_clust()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_zv()
## • step_normalize()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## 
## Call:
## stats::hclust(d = stats::as.dist(dmat), method = linkage_method)
## 
## Cluster method   : ward.D2 
## Number of objects: 133
# Cluster assignments
extract_cluster_assignment(hc_fit) %>% 
  table()
## .cluster
## Cluster_1 Cluster_2 Cluster_3 
##        36        42        55

Plot the dendrogram

# Pull the raw hclust object out of the trained workflow for plotting
hc_model <- extract_fit_parsnip(hc_fit)$fit

library(dendextend)
# Horizontal dendrogram cut at k = 3, one colour / rectangle per cluster
fviz_dend(
  hc_model,
  k = 3,
  horiz = TRUE,
  rect = TRUE,
  rect_fill = TRUE,
  rect_border = "jco",
  k_colors = "jco",
  cex = 0.1
)

Principal Component Analysis

  • Step 2: Addition of Principal Component Analysis (pca) step to the Kmeans model to see if it improves performance
  • PCA - is a dimensionality reduction technique used to transform the original high-dimensional data into a lower-dimensional space while retaining most of the variance (important information) in the data.
  • It can simplify the data structure, highlight underlying patterns, and make it easier to identify distinct clusters.

Basic Recipe

# (Re)define the normalization recipe and bake it on the test set to
# inspect the preprocessed predictors
basic_rec <- recipe(~., data = wine_train) %>% 
  step_zv(all_predictors()) %>% 
  step_normalize(all_numeric_predictors())

wine_processed <- prep(basic_rec) %>% bake(new_data = wine_test)
wine_processed
## # A tibble: 45 × 13
##    Alcohol Malic_Acid     Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids
##      <dbl>      <dbl>   <dbl>        <dbl>     <dbl>         <dbl>      <dbl>
##  1   1.46    -0.569    0.164       -1.19      1.85           0.806      1.02 
##  2   0.190   -0.507   -0.908       -2.49      0.0204         0.571      0.731
##  3   0.141    0.00733  1.05        -0.301     0.0883         0.806      1.20 
##  4   1.00    -0.887   -0.428       -1.07     -0.115          1.09       1.11 
##  5   1.65    -0.427   -0.0211      -2.26      0.156          1.59       1.58 
##  6   0.967   -0.693    0.866        0.114     1.04           1.04       1.35 
##  7   0.733    0.663    0.644       -1.31      1.11           0.650      0.993
##  8   0.314   -0.560   -0.908       -0.776    -0.387          0.179      0.178
##  9   1.02    -0.400    1.53        -0.0642    0.496          1.04       0.934
## 10   0.782   -0.462   -0.0951      -0.716     0.292          0.210      0.663
## # ℹ 35 more rows
## # ℹ 6 more variables: Nonflavanoid_Phenols <dbl>, Proanthocyanins <dbl>,
## #   Color_Intensity <dbl>, Hue <dbl>, OD280 <dbl>, Proline <dbl>
# Helper that preps a recipe, bakes it on `dat`, and draws a scatter-plot
# matrix of the first four principal components (density panels on the
# diagonal via layer.diag = 2).
library(ggforce)
plot_test2 <- function(recipe, dat = wine_test) {
  recipe %>%
    prep() %>%
    bake(new_data = dat) %>%
    ggplot() +
    # Bug fix: a constant colour must be SET outside aes(); inside aes() the
    # literal "steelblue" is treated as data, which draws the points in the
    # default scale colour and adds a spurious legend entry named "steelblue".
    geom_autopoint(color = "steelblue", alpha = 0.5, size = 1) +
    geom_autodensity(alpha = 0.3) +
    facet_matrix(rows = vars(PC1, PC2, PC3, PC4), layer.diag = 2)
}
# PCA: pairwise plot of the first four principal components on the test set
basic_rec %>% 
  step_pca(all_numeric_predictors(), num_comp = 4) %>% 
  plot_test2() +
  theme_bw()

PC1 and PC2 capture small clusters in the data; this indicates that groups of data points in the dataset share some similarities.

# What features are driving performance?
# Top-5 loadings (by absolute value) for each of the first four components;
# tidy(number = 3) extracts the loadings from the recipe's third step (step_pca)
basic_rec %>% 
  step_pca(all_numeric_predictors(), num_comp = 4) %>% 
  prep() %>% 
  tidy(number = 3) %>% 
  filter(component %in% paste0("PC",1:4)) %>% 
  group_by(component) %>% 
  slice_max(abs(value), n = 5) %>% 
  ungroup() %>% 
  ggplot(aes(abs(value), terms, fill = value > 0)) +
  geom_col(position = "dodge", alpha = 0.7) +
  facet_wrap(~component, scales = "free_y") +
  labs(x = "Contribution to Principal Component", y = NULL, fill = "Positive?") +
  theme_bw()

# Fit the model, including step_pca (normalize -> 4 principal components)
pca_rec <- basic_rec %>% 
  step_pca(all_numeric_predictors(), num_comp = 4)
# Workflow: same k-means spec, PCA-augmented recipe
kmeans_wf_pca <- workflow() %>% 
  add_recipe(pca_rec) %>% 
  add_model(kmeans_spec)
# Tuning on the same folds and grid as before, for a fair comparison
set.seed(3456)
wine_res_pca <- tune_cluster(
  kmeans_wf_pca,
  resamples = wine_folds,
  grid = grid,
  metrics = cluster_metric_set(silhouette_avg))

wine_res_pca
## # Tuning results
## # 10-fold cross-validation 
## # A tibble: 10 × 4
##    splits           id     .metrics          .notes          
##    <list>           <chr>  <list>            <list>          
##  1 <split [119/14]> Fold01 <tibble [10 × 5]> <tibble [0 × 3]>
##  2 <split [119/14]> Fold02 <tibble [10 × 5]> <tibble [0 × 3]>
##  3 <split [119/14]> Fold03 <tibble [10 × 5]> <tibble [0 × 3]>
##  4 <split [120/13]> Fold04 <tibble [10 × 5]> <tibble [0 × 3]>
##  5 <split [120/13]> Fold05 <tibble [10 × 5]> <tibble [0 × 3]>
##  6 <split [120/13]> Fold06 <tibble [10 × 5]> <tibble [0 × 3]>
##  7 <split [120/13]> Fold07 <tibble [10 × 5]> <tibble [0 × 3]>
##  8 <split [120/13]> Fold08 <tibble [10 × 5]> <tibble [0 × 3]>
##  9 <split [120/13]> Fold09 <tibble [10 × 5]> <tibble [0 × 3]>
## 10 <split [120/13]> Fold10 <tibble [10 × 5]> <tibble [0 × 3]>
# Mean silhouette per candidate k for the PCA + k-means model
wine_res_pca %>% 
  collect_metrics()
## # A tibble: 10 × 7
##    num_clusters .metric        .estimator    mean     n  std_err .config        
##           <int> <chr>          <chr>        <dbl> <int>    <dbl> <chr>          
##  1            1 silhouette_avg standard   NaN         0 NA       Preprocessor1_…
##  2            2 silhouette_avg standard     0.342    10  0.0152  Preprocessor1_…
##  3            3 silhouette_avg standard     0.402    10  0.00249 Preprocessor1_…
##  4            4 silhouette_avg standard     0.351    10  0.00541 Preprocessor1_…
##  5            5 silhouette_avg standard     0.333    10  0.0118  Preprocessor1_…
##  6            6 silhouette_avg standard     0.295    10  0.00907 Preprocessor1_…
##  7            7 silhouette_avg standard     0.285    10  0.00640 Preprocessor1_…
##  8            8 silhouette_avg standard     0.264    10  0.00819 Preprocessor1_…
##  9            9 silhouette_avg standard     0.271    10  0.00318 Preprocessor1_…
## 10           10 silhouette_avg standard     0.253    10  0.00688 Preprocessor1_…
# Visualizing the results: mean silhouette width against the number of clusters
wine_res_pca %>%
  collect_metrics() %>%
  select(num_clusters, mean) %>%
  ggplot(aes(x = num_clusters, y = mean)) +
  geom_point() +
  geom_line() +
  theme_minimal() +
  ylab("mean silhouette score") +   # fixed typo: "silhoutte" -> "silhouette"
  xlab("Number of clusters") +
  scale_x_continuous(breaks = 1:10)

After adding a PCA step to the k-means model, the silhouette average score improved to 0.40, with 3 still being the optimal number of clusters.

# Finalize the PCA + k-means workflow at k = 3 (best_cluster defined earlier)
wines_final_wf_pca <- finalize_workflow_tidyclust(kmeans_wf_pca, best_cluster)
wines_final_wf_pca
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: k_means()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## • step_zv()
## • step_normalize()
## • step_pca()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## K Means Cluster Specification (partition)
## 
## Main Arguments:
##   num_clusters = 3
## 
## Computational engine: stats
# Fit the finalized PCA + k-means workflow on the training data
wines_fit_pca <- fit(wines_final_wf_pca, data = wine_train)
wines_fit_pca
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: k_means()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## • step_zv()
## • step_normalize()
## • step_pca()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## K-means clustering with 3 clusters of sizes 38, 52, 43
## 
## Cluster means:
##          PC1        PC2          PC3         PC4
## 2  2.6442278 -1.2106895 -0.157634206  0.07725296
## 1 -2.2816279 -0.6278453 -0.004578984 -0.07358049
## 3  0.4224185  1.8291664  0.144842024  0.02071100
## 
## Clustering vector:
##   [1] 1 2 1 2 3 2 1 1 1 3 3 1 3 1 2 2 2 2 1 3 3 1 3 3 3 1 2 3 1 1 2 2 2 1 1 2 3
##  [38] 1 1 3 3 1 2 2 2 1 3 2 3 3 2 3 1 3 3 2 2 3 1 1 1 3 1 2 2 1 2 2 2 3 1 3 3 2
##  [75] 2 2 2 2 1 3 1 3 2 3 2 1 2 3 2 2 2 1 3 3 1 2 2 3 3 2 3 2 1 1 1 3 1 2 2 2 1
## [112] 3 3 1 2 2 2 2 3 2 3 3 1 1 2 3 3 2 3 2 2 2 3
## 
## Within cluster sum of squares by cluster:
## [1] 104.8761 213.9112 188.4367
##  (between_SS / total_SS =  60.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Visualize the clusters in the 4-component PCA space; extract the raw
# stats::kmeans object first, since fviz_cluster() cannot take a workflow
kmeans_model_pca <- extract_fit_parsnip(wines_fit_pca)$fit

fviz_cluster(
  kmeans_model_pca,
  data = prep(pca_rec) %>% bake(new_data = NULL),
  geom = "point",
  ellipse.type = "circle",
  palette = "jco",
  ggtheme = theme_minimal()
)